import numpy as np
import nbconvert
import warnings
warnings.simplefilter(action='ignore', category=Warning)
from sklearn import datasets
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,plot_confusion_matrix,classification_report,precision_score,recall_score
from sklearn.svm import SVC,SVR
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.manifold import TSNE
from feature_engine.selection import DropConstantFeatures,DropCorrelatedFeatures,DropDuplicateFeatures,SmartCorrelatedSelection
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,f1_score,plot_confusion_matrix,roc_auc_score
warnings.filterwarnings(action='ignore')
from sklearn.tree import DecisionTreeClassifier
# Load the handwritten-digits dataset as a pandas DataFrame
digits = datasets.load_digits(as_frame=True)
df = digits.frame
shape_msg = f"Shape of the dataset:{df.shape}"
print(shape_msg)
Shape of the dataset:(1797, 65)
# Helper to fit a classification model, print its metrics and return a summary row
def create_classification_model(model_name, model, X_train, X_test, y_train, y_test):
    """Fit *model* on the training data, print evaluation metrics, and return
    a one-row summary DataFrame together with the fitted model.

    Handles both binary and multi-class targets: for multi-class problems
    f1_score is weighted and roc_auc_score uses one-vs-rest on the full
    probability matrix; for binary problems the positive-class probability
    column is used directly.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model' column of the returned DataFrame.
    model : estimator
        Any classifier exposing fit/predict/predict_proba (sklearn-style).
    X_train, X_test, y_train, y_test :
        Train/test split of features and target (y_* must support nunique()).

    Returns
    -------
    (pandas.DataFrame, estimator)
        One-row metrics frame and the fitted model.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)

    # Compute each metric exactly once (the original duplicated every call
    # between the print statements and the DataFrame construction).
    if y_train.nunique() > 2:
        f1 = f1_score(y_test, y_pred, average='weighted')
        roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class='ovr')
    else:
        f1 = f1_score(y_test, y_pred)
        # Binary roc_auc expects the probability of the positive class only.
        roc_auc = roc_auc_score(y_test, y_pred_probs[:, 1])

    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, y_pred)

    print(f"Training_accuracy_score: {train_acc}")
    print(f"Testing_accuracy_score: {test_acc}")
    print(f"f1_score: {f1}")
    print(f"roc_auc_score: {roc_auc}")
    print("\n---------Classification report----------\n")
    print(classification_report(y_test, y_pred))

    df = pd.DataFrame({'Model': [model_name],
                       'Training_accuracy_score': [train_acc],
                       'Testing_accuracy_score': [test_acc],
                       'roc_auc_score': [roc_auc],
                       'f1_score': [f1]})
    return df, model
# Creating data for training the model
X = df.drop(columns=['target'])
y = df['target']
# Use the same data splitting as done for week 4 using SVM
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)
# Standardising the data
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
# Checking the optimal value of K by visualising over- and underfitting
test_err = []
train_err = []
neighbors = range(1, 20)
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(scaled_X_train, y_train)
    # Error = 1 - accuracy for both splits
    test_err.append(1 - accuracy_score(y_test, knn.predict(scaled_X_test)))
    train_err.append(1 - accuracy_score(y_train, knn.predict(scaled_X_train)))
plt.plot(neighbors, test_err, label='test_err', marker='*')
plt.plot(neighbors, train_err, label='train_err', marker='*')
plt.vlines(x=13, ymin=0, ymax=0.06)
plt.ylabel('Error')
plt.xlabel('Number_of_neighbors')
plt.legend()
<matplotlib.legend.Legend at 0x2283bf54fd0>
From the above plot, the optimal number of neighbors appears to be k = 13.
# Train a KNN model with the chosen k = 13 (default Euclidean metric)
model_knn = KNeighborsClassifier(n_neighbors=13)
result_knn_df_eucledian, model_knn = create_classification_model(
    'KnearestNeighbor_eucledian', model_knn,
    scaled_X_train, scaled_X_test,
    y_train, y_test)
Training_accuracy_score: 0.9684123025768911
Testing_accuracy_score: 0.968013468013468
f1_score: 0.9679731865847384
roc_auc_score: 0.9962960228882123
---------Classification report----------
precision recall f1-score support
0 1.00 1.00 1.00 59
1 0.91 0.97 0.94 60
2 0.98 0.98 0.98 59
3 0.98 0.98 0.98 60
4 1.00 0.95 0.97 60
5 0.97 0.97 0.97 60
6 0.97 1.00 0.98 60
7 0.95 0.98 0.97 59
8 0.96 0.90 0.93 58
9 0.97 0.95 0.96 59
accuracy 0.97 594
macro avg 0.97 0.97 0.97 594
weighted avg 0.97 0.97 0.97 594
From above report and metrics,we have a decent accuracy as well as good recall score of every class
Now comparing the above results from KNN with below week 4 SVM
# Reload the digits dataset for the week-4 SVM comparison
digits = datasets.load_digits(as_frame=True)
digits_df = digits.frame
def create_classification_model_4(model_name, model, X_train, X_test, y_train, y_test):
    """Week-4 helper: fit *model*, print its metrics and classification
    report, and return a one-row summary DataFrame plus the fitted model.

    Assumes a multi-class target (weighted f1, one-vs-rest roc_auc).

    Returns
    -------
    (pandas.DataFrame, estimator)
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)

    # Compute each metric once instead of twice (print + DataFrame).
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class='ovr')

    print(f"Training_score: {train_score}")
    print(f"Testing_score: {test_score}")
    print(f"f1_score: {f1}")
    print(f"roc_auc_score: {roc_auc}")
    print("\n---------Classification report----------\n")
    print(classification_report(y_test, y_pred))

    df = pd.DataFrame({'Model': [model_name],
                       'Training_score': [train_score],
                       'Testing_score': [test_score],
                       'roc_auc_score': [roc_auc],
                       'f1_score': [f1]})
    return df, model
# Extracting the pixel feature columns from the dataset
digits_df_predictors = digits_df.iloc[:, :-1]
digits_df_predictors
# Standardising the dataset
scaler = StandardScaler()
scaled_digits_df_predictors = scaler.fit_transform(digits_df_predictors)
# Applying PCA on the full set of predictors
pca_digits = PCA()
pca_transformed_digits_df_predictors = pca_digits.fit_transform(scaled_digits_df_predictors)
pca_transformed_digits_df_predictors
# Keeping only the first 3 principal components
pca_transformed_digits_df_predictors_df = pd.DataFrame(
    pca_transformed_digits_df_predictors[:, :3],
    columns=['pc1', 'pc2', 'pc3'])
pca_transformed_digits_df_full = pd.concat(
    [pca_transformed_digits_df_predictors_df, digits_df.iloc[:, -1]], axis=1)
# pca_transformed_digits_df_full
X = pca_transformed_digits_df_full.iloc[:, 0:3]
y = pca_transformed_digits_df_full['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Hyperparameters below were found earlier via the (now commented-out) grid search
svc = SVC(probability=True, kernel='rbf', C=3.1, gamma=0.1)
# param_grid = {"C":np.arange(start=0.1, stop=5, step=0.1),
#               'gamma' : [1,0.1,0.01,0.001]}
# kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
# grid = GridSearchCV(estimator=svc,param_grid=param_grid,
#                     cv = kf,return_train_score=True,verbose=1)
SVC_df, grid_svc = create_classification_model_4('SVC', svc, X_train, X_test, y_train, y_test)
Training_score: 0.8003182179793158
Testing_score: 0.7462962962962963
f1_score: 0.7357826318887575
roc_auc_score: 0.9628497633083247
---------Classification report----------
precision recall f1-score support
0 0.89 0.92 0.91 53
1 0.79 0.82 0.80 50
2 0.85 0.85 0.85 47
3 0.73 0.69 0.70 54
4 0.95 0.95 0.95 60
5 0.42 0.20 0.27 66
6 1.00 0.92 0.96 53
7 0.77 0.84 0.80 55
8 0.44 0.63 0.52 43
9 0.59 0.75 0.66 59
accuracy 0.75 540
macro avg 0.74 0.76 0.74 540
weighted avg 0.74 0.75 0.74 540
As we can see, SVM did not perform well across all classes — some classes have poor recall and precision scores —
whereas KNN performs decently on this dataset.
The training and testing scores of KNN are much better than those of SVM.
# Align the SVC column names with the KNN summary so the rows can be combined
SVC_df.rename(columns={'Training_score': 'Training_accuracy_score',
                       'Testing_score': 'Testing_accuracy_score'},
              inplace=True)
# DataFrame.append was deprecated and removed in pandas 2.0 — use pd.concat instead
results = pd.concat([result_knn_df_eucledian, SVC_df])
results
| Model | Training_accuracy_score | Testing_accuracy_score | roc_auc_score | f1_score | |
|---|---|---|---|---|---|
| 0 | KnearestNeighbor_eucledian | 0.968412 | 0.968013 | 0.996296 | 0.967973 |
| 0 | SVC | 0.800318 | 0.746296 | 0.962850 | 0.735783 |
# Loading the digits dataset
digits = datasets.load_digits(as_frame=True)
df = digits['frame']
# Creating data for training the model
X = df.drop(columns=['target'])
y = df['target']
# Splitting the dataset into 50% train and 50% test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y)
# Typo fixed in the banners below: "Shapa" -> "Shape"
print(f"Shape of X_train:{X_train.shape}")
print(f"Shape of y_train:{y_train.shape}")
print(f"Shape of X_test:{X_test.shape}")
print(f"Shape of y_test:{y_test.shape}")
# Checking the optimal tree depth by visualizing over- and underfitting
test_50_50_err = []
train_50_50_err = []
depth_tree = range(1, 20)
for depth in depth_tree:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt.fit(X_train, y_train)
    # Error = 1 - accuracy for both splits
    test_50_50_err.append(1 - accuracy_score(y_test, dt.predict(X_test)))
    train_50_50_err.append(1 - accuracy_score(y_train, dt.predict(X_train)))
# Plotting the above underfitting and overfitting
plt.plot(depth_tree, test_50_50_err, label='test_50_50_err')
plt.plot(depth_tree, train_50_50_err, label='train_50_50_err')
plt.ylabel("Error")
plt.xlabel("depth of tree")
plt.title("Error vs Depth of tree using 50-50 splitting")
plt.legend();
Shapa of X_train:(898, 64) Shapa of y_train:(898,) Shapa of X_test:(899, 64) Shapa of y_test:(899,)
# Splitting the dataset into 70% train and 30% test set (comment fixed: was "50% test")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
# Typo fixed in the banners below: "Shapa" -> "Shape"
print(f"Shape of X_train:{X_train.shape}")
print(f"Shape of y_train:{y_train.shape}")
print(f"Shape of X_test:{X_test.shape}")
print(f"Shape of y_test:{y_test.shape}")
test_70_30_err = []
train_70_30_err = []
depth_tree = range(1, 20)
for depth in depth_tree:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt.fit(X_train, y_train)
    # Error = 1 - accuracy for both splits
    test_70_30_err.append(1 - accuracy_score(y_test, dt.predict(X_test)))
    train_70_30_err.append(1 - accuracy_score(y_train, dt.predict(X_train)))
plt.plot(depth_tree, test_70_30_err, label='test_70_30_err')
plt.plot(depth_tree, train_70_30_err, label='train_70_30_err')
plt.ylabel("Error")
plt.xlabel("depth of tree using 70-30 splitting")
plt.title("Error vs Depth of tree")
plt.legend();
Shapa of X_train:(1257, 64) Shapa of y_train:(1257,) Shapa of X_test:(540, 64) Shapa of y_test:(540,)
# Overlay train/test error vs tree depth for both split strategies
for errors, curve_label in [(test_50_50_err, 'test_50_50_err'),
                            (train_50_50_err, 'train_50_50_err'),
                            (test_70_30_err, 'test_70_30_err'),
                            (train_70_30_err, 'train_70_30_err')]:
    plt.plot(depth_tree, errors, label=curve_label)
plt.vlines(x=7, ymax=0.9, ymin=0)
plt.ylabel("Error")
plt.xlabel("depth of tree")
plt.title("Error vs Depth of tree for 50-50 & 70-30 split")
plt.legend();
From the above graphs, both splitting schemes (50-50 and 70-30) suggest an optimal tree depth of 7.
# Train a decision tree of depth 7 on the 50-50 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y)
print("----Recorded Metrics using DT on 50-50 train-test split---\n")
dt = DecisionTreeClassifier(max_depth=7)
result_df_50_50, model_dt = create_classification_model(
    'DecisionTree_50_50', dt, X_train, X_test, y_train, y_test)
----Recorded Metrics using DT on 50-50 train-test split---
Training_accuracy_score: 0.9020044543429844
Testing_accuracy_score: 0.8242491657397107
f1_score: 0.8249019275363464
roc_auc_score: 0.9251636635275006
---------Classification report----------
precision recall f1-score support
0 0.98 0.94 0.96 89
1 0.71 0.78 0.74 91
2 0.77 0.78 0.78 88
3 0.84 0.71 0.77 92
4 0.84 0.84 0.84 91
5 0.91 0.87 0.89 91
6 0.95 0.95 0.95 91
7 0.80 0.89 0.84 89
8 0.81 0.74 0.77 87
9 0.68 0.76 0.72 90
accuracy 0.82 899
macro avg 0.83 0.82 0.82 899
weighted avg 0.83 0.82 0.82 899
result_df_50_50
| Model | Training_accuracy_score | Testing_accuracy_score | roc_auc_score | f1_score | |
|---|---|---|---|---|---|
| 0 | DecisionTree_50_50 | 0.902004 | 0.824249 | 0.925164 | 0.824902 |
# Train a decision tree of depth 7 on the 70-30 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
print("----Recorded Metrics using DT on 70-30 train-test split---\n")
dt = DecisionTreeClassifier(max_depth=7)
result_df_70_30, model_dt = create_classification_model(
    'DecisionTree_70_30', dt, X_train, X_test, y_train, y_test)
----Recorded Metrics using DT on 70-30 train-test split---
Training_accuracy_score: 0.9108989657915673
Testing_accuracy_score: 0.8203703703703704
f1_score: 0.8216944835965356
roc_auc_score: 0.9266031372940701
---------Classification report----------
precision recall f1-score support
0 0.96 0.91 0.93 54
1 0.73 0.73 0.73 55
2 0.80 0.77 0.79 53
3 0.85 0.84 0.84 55
4 0.79 0.85 0.82 54
5 0.92 0.87 0.90 55
6 0.98 0.94 0.96 54
7 0.80 0.83 0.82 54
8 0.66 0.75 0.70 52
9 0.73 0.70 0.72 54
accuracy 0.82 540
macro avg 0.82 0.82 0.82 540
weighted avg 0.82 0.82 0.82 540
# DataFrame.append was deprecated and removed in pandas 2.0 — use pd.concat instead
df = pd.concat([result_df_50_50, result_df_70_30])
df
| Model | Training_accuracy_score | Testing_accuracy_score | roc_auc_score | f1_score | |
|---|---|---|---|---|---|
| 0 | DecisionTree_50_50 | 0.902004 | 0.824249 | 0.925164 | 0.824902 |
| 0 | DecisionTree_70_30 | 0.910899 | 0.820370 | 0.926603 | 0.821694 |
So, from the above comparison using the DataFrame:
Using more data for training achieves a higher training score.
The 70-30 and 50-50 splits achieved similar testing accuracy, roc_auc and f1_score.
# Reload the digits dataset
digits = datasets.load_digits(as_frame=True)
df = digits['frame']
# Build the feature matrix and target
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
# KNN with cosine as the distance metric
model_knn = KNeighborsClassifier(n_neighbors=17, metric='cosine')
print("----Recorded Metrics using distance metric-cosine for KNN---\n")
result_knn_df_cosine, model_knn = create_classification_model(
    'KnearestNeighbor_cosine', model_knn,
    scaled_X_train, scaled_X_test, y_train, y_test)
----Recorded Metrics using distance metric-cosine for KNN---
Training_accuracy_score: 0.9609310058187863
Testing_accuracy_score: 0.9410774410774411
f1_score: 0.9404458620892752
roc_auc_score: 0.9956006984170029
---------Classification report----------
precision recall f1-score support
0 1.00 0.98 0.99 59
1 0.81 0.98 0.89 60
2 0.97 0.97 0.97 59
3 0.98 0.97 0.97 60
4 0.98 0.93 0.96 60
5 0.92 0.95 0.93 60
6 0.98 1.00 0.99 60
7 0.88 0.98 0.93 59
8 1.00 0.72 0.84 58
9 0.95 0.92 0.93 59
accuracy 0.94 594
macro avg 0.95 0.94 0.94 594
weighted avg 0.95 0.94 0.94 594
# Using cityblock distance, also known as manhattan distance (p = 1)
model_knn = KNeighborsClassifier(n_neighbors=17, p=1)
# Typo fixed in the banner string: "mahattan" -> "manhattan"
print('---# using distance metrics as cityblock also known as manhattan distance when p =1--\n')
result_knn_df_cityblock, model_knn = create_classification_model(
    'KnearestNeighbor_cityblock', model_knn,
    scaled_X_train, scaled_X_test, y_train, y_test)
# grid.fit(scaled_X_train,y_train)
---# using distance metrics as cityblock also known as mahattan distance when p =1--
Training_accuracy_score: 0.9650872817955112
Testing_accuracy_score: 0.9629629629629629
f1_score: 0.9628919894095478
roc_auc_score: 0.9962049031204085
---------Classification report----------
precision recall f1-score support
0 1.00 0.98 0.99 59
1 0.86 0.98 0.91 60
2 1.00 0.98 0.99 59
3 0.98 0.98 0.98 60
4 0.98 0.98 0.98 60
5 1.00 0.95 0.97 60
6 0.98 0.98 0.98 60
7 0.94 1.00 0.97 59
8 0.96 0.83 0.89 58
9 0.95 0.95 0.95 59
accuracy 0.96 594
macro avg 0.97 0.96 0.96 594
weighted avg 0.97 0.96 0.96 594
# DataFrame.append was deprecated and removed in pandas 2.0 — stack all three KNN summaries with pd.concat
df = pd.concat([result_knn_df_eucledian, result_knn_df_cosine, result_knn_df_cityblock])
df
| Model | Training_accuracy_score | Testing_accuracy_score | roc_auc_score | f1_score | |
|---|---|---|---|---|---|
| 0 | KnearestNeighbor_eucledian | 0.968412 | 0.968013 | 0.996296 | 0.967973 |
| 0 | KnearestNeighbor_cosine | 0.960931 | 0.941077 | 0.995601 | 0.940446 |
| 0 | KnearestNeighbor_cityblock | 0.965087 | 0.962963 | 0.996205 | 0.962892 |
The ROC_AUC scores of the three models above are quite similar.
The f1_score using cosine distance is lower than the similar performance achieved by cityblock and euclidean.
Similarly, the testing accuracy using cosine distance is lower than the similar performance achieved by cityblock and euclidean.
# Load the IBM HR employee-attrition dataset (CSV expected in the working directory)
hr_df = pd.read_csv('HR-Employee-Attrition.csv')
hr_df.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
# Count missing values per column (isnull is an alias of isna)
hr_df.isnull().sum()
Age 0 Attrition 0 BusinessTravel 0 DailyRate 0 Department 0 DistanceFromHome 0 Education 0 EducationField 0 EmployeeCount 0 EmployeeNumber 0 EnvironmentSatisfaction 0 Gender 0 HourlyRate 0 JobInvolvement 0 JobLevel 0 JobRole 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 MonthlyRate 0 NumCompaniesWorked 0 Over18 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 0 RelationshipSatisfaction 0 StandardHours 0 StockOptionLevel 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInCurrentRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
# Inspect the cardinality of every feature; list the actual values for low-cardinality columns
for feature in hr_df.columns:
    n_unique = hr_df[feature].nunique()
    if n_unique < 10:
        print(f"{feature} has {n_unique} unique values :-\n{hr_df[feature].unique()}\n")
    else:
        print(f"{feature} has {n_unique} unique values\n")
Age has 43 unique values Attrition has 2 unique values :- ['Yes' 'No'] BusinessTravel has 3 unique values :- ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel'] DailyRate has 886 unique values Department has 3 unique values :- ['Sales' 'Research & Development' 'Human Resources'] DistanceFromHome has 29 unique values Education has 5 unique values :- [2 1 4 3 5] EducationField has 6 unique values :- ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree' 'Human Resources'] EmployeeCount has 1 unique values :- [1] EmployeeNumber has 1470 unique values EnvironmentSatisfaction has 4 unique values :- [2 3 4 1] Gender has 2 unique values :- ['Female' 'Male'] HourlyRate has 71 unique values JobInvolvement has 4 unique values :- [3 2 4 1] JobLevel has 5 unique values :- [2 1 3 4 5] JobRole has 9 unique values :- ['Sales Executive' 'Research Scientist' 'Laboratory Technician' 'Manufacturing Director' 'Healthcare Representative' 'Manager' 'Sales Representative' 'Research Director' 'Human Resources'] JobSatisfaction has 4 unique values :- [4 2 3 1] MaritalStatus has 3 unique values :- ['Single' 'Married' 'Divorced'] MonthlyIncome has 1349 unique values MonthlyRate has 1427 unique values NumCompaniesWorked has 10 unique values Over18 has 1 unique values :- ['Y'] OverTime has 2 unique values :- ['Yes' 'No'] PercentSalaryHike has 15 unique values PerformanceRating has 2 unique values :- [3 4] RelationshipSatisfaction has 4 unique values :- [1 4 2 3] StandardHours has 1 unique values :- [80] StockOptionLevel has 4 unique values :- [0 1 3 2] TotalWorkingYears has 40 unique values TrainingTimesLastYear has 7 unique values :- [0 3 2 5 1 4 6] WorkLifeBalance has 4 unique values :- [1 3 2 4] YearsAtCompany has 37 unique values YearsInCurrentRole has 19 unique values YearsSinceLastPromotion has 16 unique values YearsWithCurrManager has 18 unique values
# Checking for class imbalance: percentage share of each Attrition class
hr_df['Attrition'].value_counts(normalize=True) * 100
No 83.877551 Yes 16.122449 Name: Attrition, dtype: float64
The data is imbalanced as number of Yes class is very low compared to No class
# Encode the Yes/No target labels as 1/0 so models can consume them
att = {'Yes': 1, 'No': 0}
hr_df['Attrition'] = hr_df['Attrition'].replace(att)
# Class shares are unchanged by the relabelling
hr_df['Attrition'].value_counts(normalize=True) * 100
0 83.877551 1 16.122449 Name: Attrition, dtype: float64
hr_df.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 0 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 1 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 0 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 0 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
hr_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1470 non-null int64 1 Attrition 1470 non-null int64 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(27), object(8) memory usage: 402.1+ KB
# One-hot encode the categorical columns (drop_first avoids the dummy-variable trap)
df = pd.get_dummies(hr_df, drop_first=True)
df.head()
| Age | Attrition | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | ... | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Married | MaritalStatus_Single | OverTime_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | 1102 | 1 | 2 | 1 | 1 | 2 | 94 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
| 1 | 49 | 0 | 279 | 8 | 1 | 1 | 2 | 3 | 61 | 2 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 2 | 37 | 1 | 1373 | 2 | 2 | 1 | 4 | 4 | 92 | 2 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| 3 | 33 | 0 | 1392 | 3 | 4 | 1 | 5 | 4 | 56 | 3 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| 4 | 27 | 0 | 591 | 2 | 1 | 1 | 7 | 1 | 40 | 3 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 48 columns
# Keep an untouched copy of the encoded frame for later evaluation
hr = df.copy()
X = df.drop(columns=['Attrition'])
y = df['Attrition']
df.head(1)
| Age | Attrition | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | ... | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Married | MaritalStatus_Single | OverTime_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | 1102 | 1 | 2 | 1 | 1 | 2 | 94 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
1 rows × 48 columns
# Baseline: default random forest on the imbalanced data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)
print("training the dataset with default Randomforest settings-----\n\nGetting metrics on imbalanced dataset")
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
print(classification_report(y_test, pred))
training the dataset with default Randomforest settings-----
Getting metrics on imbalanced dataset
precision recall f1-score support
0 0.85 0.98 0.91 408
1 0.56 0.13 0.21 78
accuracy 0.84 486
macro avg 0.71 0.55 0.56 486
weighted avg 0.81 0.84 0.80 486
From the above report, the recall score for the 1 (yes) class is very low because the class distribution is imbalanced; we cannot rely on the accuracy score alone.
To tackle this, we will use the SMOTE technique to create synthetic data that increases the minority-class samples.
We then train the model with hyperparameter tuning on the new dataset and check its performance on that dataset.
Finally, we apply the model trained on the synthetically created dataset to the original imbalanced dataset to check its performance.
# imblearn supplies the oversampling utilities used below (SMOTE)
from imblearn.over_sampling import RandomOverSampler,SMOTE
# Class counts before oversampling
y.value_counts()
0 1233 1 237 Name: Attrition, dtype: int64
# Create synthetic minority-class samples to balance the dataset
smote = SMOTE(sampling_strategy='auto', random_state=1)
X_smote, y_smote = smote.fit_resample(X, y)
# Compare class counts before vs after resampling
y.value_counts(), y_smote.value_counts()
(0 1233 1 237 Name: Attrition, dtype: int64, 1 1233 0 1233 Name: Attrition, dtype: int64)
Now Classes are balanced
# Split the balanced (SMOTE) data, stratifying on the resampled target
X_train, X_test, y_train, y_test = train_test_split(
    X_smote, y_smote, test_size=0.33, random_state=42, stratify=y_smote)
# Tune a random forest on the balanced dataset via grid search
rf_smote = RandomForestClassifier()
param_grid = {'n_estimators': [50, 64, 100, 128, 200],
              'criterion': ['gini', 'entropy'],
              'max_depth': [3, 4, 5]}
rf_grid_smote = GridSearchCV(estimator=rf_smote, param_grid=param_grid,
                             cv=5, verbose=1, return_train_score=True)
print('Recorded Metrics on synthetic data trained with RandomForest hyperparameters---\n')
result_smote_rf, rf_grid_smote = create_classification_model(
    'RandomForest_smote', rf_grid_smote,
    X_train, X_test, y_train, y_test)
Recorded Metrics on synthetic data trained with RandomForest hyperparameters---
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Training_accuracy_score: 0.910411622276029
Testing_accuracy_score: 0.8734643734643734
f1_score: 0.8742368742368742
roc_auc_score: 0.939353693653448
---------Classification report----------
precision recall f1-score support
0 0.88 0.87 0.87 407
1 0.87 0.88 0.87 407
accuracy 0.87 814
macro avg 0.87 0.87 0.87 814
weighted avg 0.87 0.87 0.87 814
result_smote_rf
| Model | Training_accuracy_score | Testing_accuracy_score | roc_auc_score | f1_score | |
|---|---|---|---|---|---|
| 0 | RandomForest_smote | 0.910412 | 0.873464 | 0.939354 | 0.874237 |
print(f"best parameters for RandomForest:\n{rf_grid_smote.best_params_}")
best parameters for RandomForest:
{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 100}
# Collect the cross-validation results and rank them by test score
cv_columns = ['param_criterion', 'param_max_depth', 'param_n_estimators',
              'params', 'mean_test_score', 'mean_train_score', 'rank_test_score']
rf_grid_smote_result = pd.DataFrame(rf_grid_smote.cv_results_)[cv_columns]
dff = rf_grid_smote_result.sort_values(by='rank_test_score')
dff.head()
| param_criterion | param_max_depth | param_n_estimators | params | mean_test_score | mean_train_score | rank_test_score | |
|---|---|---|---|---|---|---|---|
| 27 | entropy | 5 | 100 | {'criterion': 'entropy', 'max_depth': 5, 'n_es... | 0.876506 | 0.913135 | 1 |
| 25 | entropy | 5 | 50 | {'criterion': 'entropy', 'max_depth': 5, 'n_es... | 0.875300 | 0.909352 | 2 |
| 29 | entropy | 5 | 200 | {'criterion': 'entropy', 'max_depth': 5, 'n_es... | 0.874692 | 0.911471 | 3 |
| 13 | gini | 5 | 128 | {'criterion': 'gini', 'max_depth': 5, 'n_estim... | 0.874692 | 0.913590 | 3 |
| 14 | gini | 5 | 200 | {'criterion': 'gini', 'max_depth': 5, 'n_estim... | 0.873485 | 0.911774 | 5 |
# Visualizing the performance fluctuation for different hyperparameter values of RandomForest
import plotly.express as px
# Removed dead code: `df = px.data.tips()` loaded an unrelated sample dataset,
# clobbered the module-level `df`, and was never used by this plot.
fig = px.line(dff, y="mean_test_score",
              x=dff['params'].astype('str'), markers=True,
              height=1000,
              title='Performance fluctuation for different hyperparameter values of RandomForest')
fig.show()
# Using Trained Model on Old Imbalanced Dataset
# Score the SMOTE-trained RandomForest against the full original (imbalanced) data.
X = hr.drop('Attrition', axis=1)
y = hr['Attrition']
y_pred_whole = rf_grid_smote.predict(X)
# Compute each metric once and reuse it for both printing and the summary frame.
rf_whole_acc = accuracy_score(y, y_pred_whole)
rf_whole_f1 = f1_score(y, y_pred_whole)
rf_whole_auc = roc_auc_score(y, rf_grid_smote.predict_proba(X)[:, 1])
print("Metrics recorded using model trained with hyperparameters on full original imabalnced dataset\n\n ")
print(f"accuracy_score: {rf_whole_acc}")
print(f"f1_score: {rf_whole_f1}")
print(f"roc_auc_score: {rf_whole_auc}")
print()
rf_classification = classification_report(y, y_pred_whole)
print(rf_classification)
rf_imbalance_df = pd.DataFrame({'Model': ['RandomForest_On_Imabalanced'],
                                'accuracy_score': [rf_whole_acc],
                                'f1_score': [rf_whole_f1],
                                'roc_auc_score': [rf_whole_auc]})
Metrics recorded using model trained with hyperparameters on full original imabalnced dataset
accuracy_score: 0.8564625850340136
f1_score: 0.5720081135902636
roc_auc_score: 0.8684967883896093
precision recall f1-score support
0 0.92 0.91 0.91 1233
1 0.55 0.59 0.57 237
accuracy 0.86 1470
macro avg 0.74 0.75 0.74 1470
weighted avg 0.86 0.86 0.86 1470
From the above classification report:
The recall score for class (1, yes) improved significantly compared to the previous recall score of less than 0.2.
# Baseline: train a default GradientBoostingClassifier on the original imbalanced split.
X = hr.drop('Attrition', axis=1)
y = hr['Attrition']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)
gb = GradientBoostingClassifier()
pred = gb.fit(X_train, y_train).predict(X_test)
print("training the dataset with default GradientBoosting settings-----\n\nGetting metrics on imbalanced dataset")
print(classification_report(y_test, pred))
training the dataset with default GradientBoosting settings-----
Getting metrics on imbalanced dataset
precision recall f1-score support
0 0.87 0.97 0.92 408
1 0.62 0.27 0.38 78
accuracy 0.86 486
macro avg 0.75 0.62 0.65 486
weighted avg 0.83 0.86 0.83 486
From the above report, similarly the recall score for the 1 (yes) class is very low because the dataset has imbalanced classes, so we cannot rely on the accuracy score.
To tackle this situation, we will use the SMOTE technique to create synthetic data that increases the minority-class samples.
We then train the model with hyperparameter tuning on the new dataset and check its performance on that dataset.
Lastly, we use this model trained on the synthetically created dataset on the original imbalanced dataset to check its performance.
# Tune a GradientBoosting model on the SMOTE-balanced dataset via grid search.
X_train, X_test, y_train, y_test = train_test_split(
    X_smote, y_smote, test_size=0.33, random_state=42, stratify=y_smote)
gb_smote = GradientBoostingClassifier()
param_grid = dict(n_estimators=[50, 100],
                  learning_rate=[0.1, 0.05, 0.2],
                  max_depth=[3, 4, 5])
gb_grid_smote = GridSearchCV(estimator=gb_smote, param_grid=param_grid,
                             cv=5, verbose=1, return_train_score=True)
# Fit the grid search on the SMOTE split and record its evaluation metrics.
print('Recorded Metrics on synthetic data trained with GradientBoosting hyperparameters---\n')
result_smote_gb, gb_grid_smote = create_classification_model(
    'GradientBoosting_smote', gb_grid_smote, X_train, X_test, y_train, y_test
)
Recorded Metrics on synthetic data trained with GradientBoosting hyperparameters---
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Training_accuracy_score: 1.0
Testing_accuracy_score: 0.9115479115479116
f1_score: 0.9095477386934673
roc_auc_score: 0.9719346328682938
---------Classification report----------
precision recall f1-score support
0 0.89 0.93 0.91 407
1 0.93 0.89 0.91 407
accuracy 0.91 814
macro avg 0.91 0.91 0.91 814
weighted avg 0.91 0.91 0.91 814
print(f"best parameters for GradientBoosting:\n{gb_grid_smote.best_params_}")
best parameters for GradientBoosting:
{'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 100}
# Rank the GradientBoosting CV candidates by their test-score rank.
gb_cv_columns = ['param_learning_rate', 'param_max_depth', 'param_n_estimators',
                 'params', 'mean_test_score', 'mean_train_score', 'rank_test_score']
gb_grid_smote_result = pd.DataFrame(gb_grid_smote.cv_results_)[gb_cv_columns]
dff = gb_grid_smote_result.sort_values(by='rank_test_score')
dff.head()
| param_learning_rate | param_max_depth | param_n_estimators | params | mean_test_score | mean_train_score | rank_test_score | |
|---|---|---|---|---|---|---|---|
| 15 | 0.2 | 4 | 100 | {'learning_rate': 0.2, 'max_depth': 4, 'n_esti... | 0.912216 | 1.000000 | 1 |
| 16 | 0.2 | 5 | 50 | {'learning_rate': 0.2, 'max_depth': 5, 'n_esti... | 0.911616 | 1.000000 | 2 |
| 17 | 0.2 | 5 | 100 | {'learning_rate': 0.2, 'max_depth': 5, 'n_esti... | 0.909800 | 1.000000 | 3 |
| 3 | 0.1 | 4 | 100 | {'learning_rate': 0.1, 'max_depth': 4, 'n_esti... | 0.909195 | 0.996217 | 4 |
| 14 | 0.2 | 4 | 50 | {'learning_rate': 0.2, 'max_depth': 4, 'n_esti... | 0.907981 | 0.996217 | 5 |
# Visualizing the performance fluctuation for different hyperparameter values of GradientBoosting
# FIX: removed the leftover plotly-example line `df = px.data.tips()` — it loaded
# an unrelated demo dataset and clobbered the module-level `df` (the digits frame).
import plotly.express as px

fig = px.line(
    dff,
    y="mean_test_score",
    x=dff['params'].astype('str'),
    markers=True,
    height=1000,
    title='Performance fluctuation for different hyperparameter values of GradientBoosting',
)
fig.show()
# Using Above Trained GradientBoosting Model on Old Imbalanced Dataset
# Score the SMOTE-trained GradientBoosting model on the full original (imbalanced) data.
X = hr.drop('Attrition', axis=1)
y = hr['Attrition']
y_pred_whole = gb_grid_smote.predict(X)
# Compute each metric once and reuse it for both printing and the summary frame.
gb_whole_acc = accuracy_score(y, y_pred_whole)
gb_whole_f1 = f1_score(y, y_pred_whole)
gb_whole_auc = roc_auc_score(y, gb_grid_smote.predict_proba(X)[:, 1])
print("Metrics recorded using Above Trained GradientBoosting Model with hyperparameters on full original imabalnced dataset\n\n ")
print(f"accuracy_score: {gb_whole_acc}")
print(f"f1_score: {gb_whole_f1}")
print(f"roc_auc_score: {gb_whole_auc}")
print()
gb_classification = classification_report(y, y_pred_whole)
print(gb_classification)
gb_imbalance_df = pd.DataFrame({'Model': ['GradientBoosting_On_Imabalanced'],
                                'accuracy_score': [gb_whole_acc],
                                'f1_score': [gb_whole_f1],
                                'roc_auc_score': [gb_whole_auc]})
Metrics recorded using Above Trained GradientBoosting Model with hyperparameters on full original imabalnced dataset
accuracy_score: 0.9551020408163265
f1_score: 0.8571428571428572
roc_auc_score: 0.974337915481776
precision recall f1-score support
0 0.97 0.98 0.97 1233
1 0.88 0.84 0.86 237
accuracy 0.96 1470
macro avg 0.92 0.91 0.92 1470
weighted avg 0.95 0.96 0.95 1470
From the above classification report:
The recall score for class (1, yes) improved significantly compared to the previous recall score of less than 0.3 on the imbalanced data.
print('Comparing RandomForest & GradientBoosting trained on Synthetic Created Data\n')
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same row-stacked result (original indexes kept).
pd.concat([result_smote_rf, result_smote_gb])
Comparing RandomForest & GradientBoosting trained on Synthetic Created Data
| Model | Training_accuracy_score | Testing_accuracy_score | roc_auc_score | f1_score | |
|---|---|---|---|---|---|
| 0 | RandomForest_smote | 0.910412 | 0.873464 | 0.939354 | 0.874237 |
| 0 | GradientBoosting_smote | 1.000000 | 0.911548 | 0.971935 | 0.909548 |
From the above comparison, GradientBoosting outperforms RandomForest, but GradientBoosting appears to overfit the data (training accuracy of 1.0).
print("Comparison of models trained on original full Imabalnced Data")
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same row-stacked result (original indexes kept).
result = pd.concat([rf_imbalance_df, gb_imbalance_df])
result
Comparison of models trained on original full Imabalnced Data
| Model | accuracy_score | f1_score | roc_auc_score | |
|---|---|---|---|---|
| 0 | RandomForest_On_Imabalanced | 0.856463 | 0.572008 | 0.868497 |
| 0 | GradientBoosting_On_Imabalanced | 0.955102 | 0.857143 | 0.974338 |
print(f"Classification report on Imabalanced Dataset of GradirantBoosting:\n\n{gb_classification}")
Classification report on Imabalanced Dataset of GradirantBoosting:
precision recall f1-score support
0 0.97 0.98 0.97 1233
1 0.88 0.84 0.86 237
accuracy 0.96 1470
macro avg 0.92 0.91 0.92 1470
weighted avg 0.95 0.96 0.95 1470
print(f"Classification report on Imabalanced Dataset of RandomForest:\n\n{rf_classification}")
Classification report on Imabalanced Dataset of RandomForest:
precision recall f1-score support
0 0.92 0.91 0.91 1233
1 0.55 0.59 0.57 237
accuracy 0.86 1470
macro avg 0.74 0.75 0.74 1470
weighted avg 0.86 0.86 0.86 1470
Also, GradientBoosting outperforms RandomForest on the imbalanced dataset, as GradientBoosting has a good recall score (> 0.8) for class (1, yes), compared to RandomForest's < 0.6.
From all the above results, GradientBoosting should be the preferred choice over RandomForest.
https://hastie.su.domains/ISLP/ISLP_website.pdf
https://scikit-learn.org/stable/supervised_learning.html#supervised-learning
https://scikit-learn.org/stable/modules/model_evaluation.html
https://www.analyticsvidhya.com/blog/2022/02/a-comprehensive-guide-on-hyperparameter-tuning-and-its-techniques/
https://www.datacamp.com/cheat-sheet
https://feature-engine.trainindata.com/en/1.3.x/user_guide/selection/index.html
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/
https://imbalanced-learn.org/stable/